<?php
/**
 * discuz论坛图片采集程序
 * @version 0.01
 * @author jayeeliu#gmail.com
 */

require 'phpQuery/phpQuery.php';
// Batch driver: processes a fixed window (offset 220, 20 entries) of the
// thread pages saved in the "html" directory that sits next to this script.
$path = pathinfo(__FILE__);
$htmlDir = $path['dirname'].'/html';
$ls = directoryList($htmlDir);
// directoryList() keys its result by the directory's basename; guard the
// lookup so a missing or empty directory yields an empty batch.
$file = (isset($ls['html']) && is_array($ls['html'])) ? array_slice($ls['html'], 220, 20) : array();
unset($ls);
// Only read by the commented-out "resume from a given thread" loop below.
$skip = true;
foreach ($file as $path) {
	// Sub-directories are returned as nested arrays; only plain file names are pages.
	if (!is_string($path)) continue;
	echo "$path\n";
	// Pages whose name contains "1-1.html" are not processed.
	if (strpos($path, '1-1.html') !== false) continue;
	// Read from the directory that was listed above, so the script does not
	// depend on the current working directory.
	$path = $htmlDir.'/'.$path;
	$str = file_get_contents($path);
	if ($str === false) {
		echo "\t!!!cannot read {$path}\n";
		continue;
	}
	// The download directory is named after the page file (see mkthreaddir()).
	$dir = mkthreaddir($path);
	dimg($str, $dir, true);
}
//foreach ($file as $path) {
//	if ($skip === true) {
//		if(strpos($path, 'thread-357815-1-28') !== false)
//			$skip = false;
//		else
//			continue;
//	}
//	echo "$path\n";
//	if (strpos($path, '1-1.html') !== false) continue;
//	$path = 'html/'.$path;
//	$str = file_get_contents($path);
//	dimg($str, $dir = mkthreaddir($path), true);
//}
//$pages = 29;
//$p = 1;
//while(++$p <= $pages) {
//	echo "$p\n";
//	getForumAllUrl("http://web.renxingbense.org/forum-274-{$p}.html");
//}

/**
 * Downloads the images referenced by a Discuz thread page into $dir.
 *
 * Images are taken from the result of $div[0]['img'] on the
 * div[class="t_msgfont"] selection (an alternative selector would be
 * pq('div[class="t_msgfont"]:first')). Only sources that are absolute
 * http(s) URLs are fetched; anything else (attachment-style sources) is
 * skipped. Each image is stored as "{$dir}/{index}.{extension}", where the
 * extension is derived from the Content-Type response header. Images that
 * cannot be fetched are appended to "{$dir}/missing.img.txt". When the first
 * five or more images are all unavailable, processing of the page stops.
 *
 * @param string $str  HTML of the thread page
 * @param string $dir  target directory (created when missing); an empty value means the current directory
 * @param bool   $show print a line for every image that is reachable
 * @return void
 */
function dimg($str, $dir='', $show=false) {
	$noimages = 0;
	// An empty target would make "{$dir}/..." point at the filesystem root.
	if (!is_string($dir) || $dir === '') {
		$dir = '.';
	}
	$dom = phpQuery::newDocument($str);
	//$div = pq('div[class="t_msgfontfix"]');// Discuz 7
	$div = pq('div[class="t_msgfont"]');// Discuz 6.x
	if (!is_dir($dir) && !@mkdir($dir, 0777, true) && !is_dir($dir)) {
		echo "\t!!!cannot create directory {$dir}\n";
		return ;
	}
	$images = $div[0]['img'];
	echo "\t--Total : ".count($images)." images--\n";
	foreach ($images as $k=>$im) {
		$src = trim((string)pq($im)->attr('src'));
		// Non-attachment form only: the source must be an absolute http(s) URL.
		// Anchoring the match also guarantees the value cannot start with "-"
		// and be mistaken for a wget option.
		if (!preg_match('#^https?://#i', $src)) {
			continue;
		}
		list($status, $ext) = dimgProbeImage($src);
		if ($status === 200) {
			//image exists
			$show && print "\t$k is OK!\n";
			$target = "{$dir}/{$k}.{$ext}";
			if (!file_exists($target) && !dimgFetchImage($src, $target)) {
				file_put_contents("{$dir}/missing.img.txt", "{$k}\t{$src}\n", FILE_APPEND);
			}
		} else {
			//no image
			file_put_contents("{$dir}/missing.img.txt", "{$k}\t{$src}\n", FILE_APPEND);
			echo "\t!!!{$k} no images\n";
			// $k == $noimages-1 only holds while every image so far has failed.
			if(++$noimages >= 5 && $k == $noimages-1) {
				echo "********* Too many empty files >> exit this **************\n";
				return ;
			}
		}
		// Random pause between requests.
		sleep(rand(2,4));
	}
}

/**
 * Helper for dimg(): issues a header request for $src and reports the final
 * HTTP status code together with a file extension taken from Content-Type.
 *
 * @param string $src absolute image URL
 * @return array array(int $status, string $ext); status is 0 when no headers could be read
 */
function dimgProbeImage($src) {
	$headers = @get_headers($src, 1);
	if (!is_array($headers)) {
		return array(0, '');
	}
	$status = 0;
	$type = '';
	foreach ($headers as $name => $value) {
		if (is_int($name)) {
			// Integer keys hold one status line per response of a redirect
			// chain; the last one belongs to the final response.
			if (preg_match('#^HTTP/\S+\s+(\d{3})#', (string)$value, $m)) {
				$status = (int)$m[1];
			}
		} elseif (strcasecmp($name, 'Content-Type') === 0) {
			// A header repeated along a redirect chain is returned as an array.
			$type = is_array($value) ? (string)end($value) : (string)$value;
		}
	}
	// "image/jpeg; charset=binary" -> "jpeg"
	$semi = strpos($type, ';');
	if ($semi !== false) {
		$type = substr($type, 0, $semi);
	}
	$ext = (string)substr((string)strrchr(trim($type), '/'), 1);
	// The value comes from the remote server: keep only file-name-safe characters.
	$ext = preg_replace('/[^A-Za-z0-9.+-]/', '', $ext);
	return array($status, $ext);
}

/**
 * Helper for dimg(): downloads $src to $target, using wget on non-Windows
 * systems and getRemoteFile() on Windows.
 *
 * @param string $src    absolute image URL
 * @param string $target destination file path
 * @return bool true when the file was written
 */
function dimgFetchImage($src, $target) {
	if (strtolower(substr(PHP_OS,0,3)) !== 'win') {
		// Both values are escaped because $src is scraped from a web page.
		// Success is judged by the exit status: the return value of system()
		// is the last output line, which is empty here since wget logs to a file.
		$exit = 1;
		system('wget -a log.log '.escapeshellarg($src).' -O '.escapeshellarg($target), $exit);
		if ($exit !== 0) {
			// wget -O creates the file even on failure; remove it so a later
			// run does not treat the image as already downloaded.
			if (is_file($target)) {
				unlink($target);
			}
			return false;
		}
		return true;
	}
	$data = getRemoteFile($src);
	if ($data === false || $data === null || $data === '') {
		return false;
	}
	return file_put_contents($target, $data) !== false;
}

/**
 * Collects the threads listed on a forum board page and downloads the images
 * of each thread via dimg().
 *
 * @param string $url     board address; thread links are resolved against its scheme and host
 * @param int    $version Discuz version (currently unused)
 * @return void
 */
function getForumAllUrl($url, $version=6) {
	echo "$url\tstarting...\n";
	// NOTE(review): the listing is read from a local "html/" copy, while thread
	// links below are resolved against the scheme/host parsed from the same
	// $url - confirm which form of $url callers are expected to pass.
	$str = file_get_contents('html/'.$url);
	if ($str === false) {
		echo "\t!!!cannot read html/{$url}\n";
		return ;
	}
	$pattern = '|<span id="thread_(?:\d+)">(?:.*)href="(.+)"(?:[^>]*)>([^<]*)<(?:.*)</span>|isU';
	preg_match_all($pattern, $str, $urls);
	if (empty($urls[1])) {
		return ;
	}
	// The site root is the same for every link, so it is computed once.
	$pu = parse_url($url);
	$base = '';
	if (is_array($pu) && isset($pu['scheme'], $pu['host'])) {
		$base = $pu['scheme'].'://'.$pu['host'];
		if (isset($pu['port'])) {
			$base .= ':'.$pu['port'];
		}
	}
	foreach ($urls[1] as $u) {
		// Attribute values are HTML-escaped in the page (e.g. "&amp;").
		$u = htmlspecialchars_decode($u);
		if (preg_match('#^https?://#i', $u)) {
			$path = $u;
		} elseif ($base !== '') {
			$path = $base.'/'.ltrim($u, '/');
		} else {
			echo "\t!!!cannot resolve {$u}\n";
			continue;
		}
		$html = getRemoteFile($path, true, true);
		if ($html === false || $html === null || $html === '') {
			echo "\t!!!cannot fetch {$path}\n";
			continue;
		}
		dimg($html, mkthreaddir($path), true);
	}
}

/**
 * Derives the download directory name for a thread page: the last path
 * segment of $url with its final five characters (".html") removed.
 *
 * @param string $url page URL or file path
 * @return string directory name; empty when the last segment is five characters or shorter
 */
function mkthreaddir($url) {
	$slash = strrpos($url, '/');
	// Without any "/" the whole value is the file name.
	$name = ($slash === false) ? $url : substr($url, $slash + 1);
	return (string)substr($name, 0, -5);
}

/**
 * Fetches the content of a remote file.
 *
 * @param string $url     file address
 * @param bool   $cookie  fetch through Snoopy and send the cookie stored in $GLOBALS['cookie']
 * @param bool   $filter  strip the <head> element and all <script> elements from the result
 * @param bool   $save    reserved for saving the content locally (not implemented yet)
 * @param string $referer value of the Referer request header
 *
 * @return string|false the content, or false when nothing could be fetched
 */
function getRemoteFile($url, $cookie=false, $filter=false, $save=false, $referer='http://image.baidu.com/') {
	if ($cookie) {
		//TODO proxy
		// require_once: this function is called repeatedly, and loading the
		// file a second time would re-declare the class (fatal error).
		require_once 'class-snoopy.php';
		$snoopy = new snoopy();
		$snoopy->agent = "Mozilla/5.0 (compatible; MSIE 7.0; Windows NT 6.0)";
		$snoopy->referer = $referer;
		if (isset($GLOBALS['cookie'])) {
			$snoopy->rawheaders["COOKIE"] = $GLOBALS['cookie'];
		}
		$snoopy->rawheaders["Pragma"] = "no-cache";
		$snoopy->maxredirs = 2;
		$snoopy->offsiteok = false;
		$snoopy->expandlinks = false;
		$snoopy->fetch($url);
		$str = $snoopy->results;
	} else {
		// HTTP header lines are separated by CRLF.
		$option = array(
			'http' => array('header' => "Referer: {$referer}\r\nUser-Agent: Mozilla/5.0 (Windows; U; MSIE 7.0; Windows NT 6.0; en-US)\r\n")
		);
		$str = file_get_contents($url, false, stream_context_create($option));
	}
	if ($str === false || $str === null) {
		return false;
	}
	if ($filter) {
		// Drop head/script content: some of these tags interfere with phpQuery.
		$str = preg_replace('/<head(?:.*)<\/head>/isU', '', $str);
		$str = preg_replace('/<script(?:.*)<\/script>/isU', '', $str);
		//$str = preg_replace('/<style(?:.*)<\/style>/isU', '', $str);
	}
	//TODO derive a file name and save
	//$save && file_put_contents('ts.htm', $str);
	return $str;
}


/**
 * Recursively lists a directory.
 *
 * The result is array(basename => entries): every file is an entry holding
 * its name, every sub-directory is an entry holding the nested result of the
 * same shape. The basename key is always present, so an empty or unreadable
 * directory yields array(basename => array()).
 *
 * @param string $path    directory to list
 * @param bool   $windows join paths with a backslash instead of a slash
 * @return array
 */
function directoryList($path,$windows=false){
	$slash = $windows ? '\\' : '/';
	$info = pathinfo($path);
	$basename = $info['basename'];
	$ls = array($basename => array());
	$dir = is_dir($path) ? dir($path) : false;
	if (!$dir) {
		return $ls;
	}
	// Compare against false explicitly: an entry named "0" is falsy and would
	// otherwise end the loop early.
	while (false !== ($item = $dir->read())) {
		if ($item === '.' || $item === '..') {
			continue;
		}
		$full = $path.$slash.$item;
		$ls[$basename][] = is_dir($full) ? directoryList($full, $windows) : $item;
	}
	$dir->close();
	return $ls;
}

?>